# code/stage2_issp.py

import logging
from pathlib import Path
import pandas as pd
from helpers import (
    final_clean_numeric_robust,
    clean_spss_categorical_codes,
    get_alpha3_from_numeric,
    standardize_alpha_code
)
from paths import ISSP_FILES, PROCESSED_DATA_DIR

def _spec(source, **options):
    """Return one variable spec: the SPSS ``source`` column plus cleaning options."""
    return {'source': source, **options}


# Per-wave settings, keyed by survey year (iteration order = processing order).
#   filename  - SPSS file name, resolved against ISSP_FILES.root.
#   map_type  - how process_wave derives C_ALPHAN: 'numeric' converts
#               COUNTRY_NUM, 'alpha' standardises COUNTRY_ALPHA, and any other
#               value takes the first three characters of COUNTRY_NUM.
#   vars      - target variable -> spec. Options read by process_wave:
#               'recode' (SEX value mapping), 'missing' / 'zero_missing'
#               (AGE cleaning arguments) and 'reverse' (EGAL only).
processing_configs = {
    1988: {
        'filename': 'ZA1700_2005-11-24.sav',
        'map_type': 'alpha',
        'vars': {
            'CASEID': _spec('V2'),
            'COUNTRY_ALPHA': _spec('V3'),
            'SEX': _spec('V65', recode={0: 1, 1: 2}),
            'AGE': _spec('V66', missing=[99], zero_missing=True),
            'EDUC_LEVEL': _spec('V5', reverse=False),
            'EGAL': _spec('V6', reverse=False),
        },
    },
    1994: {
        'filename': 'ZA2620_2005-12-14.sav',
        'map_type': 'extract_alpha',
        'vars': {
            'CASEID': _spec('v2'),
            'COUNTRY_NUM': _spec('v3'),
            'SEX': _spec('v200', recode={0: 1, 1: 2}),
            'AGE': _spec('v201', missing=[99], zero_missing=True),
            'EDUC_LEVEL': _spec('v204'),
            'EGAL': _spec('v5', reverse=False),
        },
    },
    2002: {
        'filename': 'ZA3880_v1-1-0.sav',
        'map_type': 'alpha',
        'vars': {
            'CASEID': _spec('v3'),
            'COUNTRY_ALPHA': _spec('C_ALPHAN'),
            'SEX': _spec('v200', recode={0: 1, 1: 2}),
            'AGE': _spec('v201', missing=[999], zero_missing=True),
            'EDUC_LEVEL': _spec('v204'),
            'EGAL': _spec('v5', reverse=False),
        },
    },
    2012: {
        'filename': 'ZA5900_v4-0-0.sav',
        'map_type': 'numeric',
        'vars': {
            'CASEID': _spec('CASEID'),
            'COUNTRY_NUM': _spec('V4'),
            'SEX': _spec('SEX'),
            'AGE': _spec('AGE', missing=[0, 998, 999]),
            'EDUC_LEVEL': _spec('EDUCYRS'),
            'EGAL': _spec('V5', reverse=False),
        },
    },
}

def process_wave(year, cfg):
    """Load one ISSP wave from its SPSS file and map it to the common layout.

    Args:
        year: Survey year; written to ``YEAR`` and embedded in ``Source``.
        cfg: One entry of ``processing_configs`` with the keys ``filename``,
            ``map_type`` and ``vars`` (target variable -> spec dict).

    Returns:
        A DataFrame with one row per row of the source file and the columns
        ``YEAR``, ``CASEID``, ``C_ALPHAN``, ``SEX``, ``AGE``, ``educ_level``,
        ``egal_index`` and ``Source``.

    Raises:
        KeyError: If ``cfg`` lacks a required key or the SPSS file lacks a
            configured source column.
    """
    path = Path(ISSP_FILES.root) / cfg['filename']
    df = pd.read_spss(path, convert_categoricals=True)
    vm = cfg['vars']

    # Build the output on the source index. Assigning a scalar to a frame with
    # no rows creates an empty column, and the first Series assignment then
    # reindexes it to all-NaN, which is how YEAR used to be lost.
    out = pd.DataFrame(index=df.index)
    out['YEAR'] = year
    out['CASEID'] = final_clean_numeric_robust(df[vm['CASEID']['source']])

    # Country code: the strategy depends on how the wave stores the country.
    map_type = cfg['map_type']
    if map_type == 'numeric':
        country_num = final_clean_numeric_robust(df[vm['COUNTRY_NUM']['source']])
        out['C_ALPHAN'] = country_num.apply(get_alpha3_from_numeric)
    elif map_type == 'alpha':
        country_raw = df[vm['COUNTRY_ALPHA']['source']].astype(str)
        out['C_ALPHAN'] = country_raw.apply(standardize_alpha_code)
    else:
        # Any other map_type (e.g. 'extract_alpha'): keep only the first three
        # characters of the stringified value before standardising.
        country_raw = df[vm['COUNTRY_NUM']['source']].astype(str)
        out['C_ALPHAN'] = country_raw.str[:3].apply(standardize_alpha_code)

    # Recode SEX only when the wave defines a mapping. Mapping through an empty
    # dict would turn every value into NaN. With a mapping, values that are not
    # among its keys still become NaN, as before.
    sex_cfg = vm['SEX']
    sex = clean_spss_categorical_codes(df[sex_cfg['source']])
    recode = sex_cfg.get('recode')
    if recode:
        sex = sex.map(recode)
    out['SEX'] = sex

    age_cfg = vm['AGE']
    out['AGE'] = final_clean_numeric_robust(
        df[age_cfg['source']],
        age_cfg.get('missing'),
        age_cfg.get('zero_missing', False),
    )

    out['educ_level'] = clean_spss_categorical_codes(df[vm['EDUC_LEVEL']['source']])

    egal_cfg = vm['EGAL']
    egal = clean_spss_categorical_codes(df[egal_cfg['source']])
    if egal_cfg.get('reverse', False):
        # NOTE(review): 6 - x presumably flips a 1-5 scale; confirm the item range.
        egal = 6 - egal
    out['egal_index'] = egal

    out['Source'] = f'ISSP_{year}'
    return out

# Process every configured wave. A wave that fails is logged and skipped so
# the remaining waves can still be combined.
dfs = []
for yr, cfg in processing_configs.items():
    try:
        dfw = process_wave(yr, cfg)
    except Exception as e:
        # logging.exception keeps the original message and adds the traceback,
        # which logging.error discarded.
        logging.exception('Error processing %s: %s', yr, e)
    else:
        dfs.append(dfw)
        logging.info(f'Wave {yr} processed, shape={dfw.shape}')

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but a message that names the real cause.
if not dfs:
    raise ValueError('No ISSP waves were processed successfully; nothing to combine')

issp_combined = pd.concat(dfs, ignore_index=True)
issp_combined.to_parquet(PROCESSED_DATA_DIR / 'issp_combined.parquet', index=False)
logging.info('ISSP combined saved')
